import pandas as pd
import plotly.express as px
Regression Analysis on Climate Data
1 Importing Necessary Libraries
2 Load the dataset
# Read the historic NABR CSV and discard the 'TimePeriod' and 'RCP' columns
# in the same expression, so `df` is bound once to the trimmed frame.
df = pd.read_csv('./processed_data/NABR_historic.csv').drop(
    columns=['TimePeriod', 'RCP']
)
df.head()
|  | long | lat | year | scenario | treecanopy | Ann_Herb | Bare | Herb | Litter | Shrub | ... | PPT_Annual | T_Winter | T_Summer | T_Annual | Tmax_Summer | Tmin_Winter | VWC_Winter_whole | VWC_Spring_whole | VWC_Summer_whole | VWC_Fall_whole |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -110.0472 | 37.60413 | 1980 | sc1 | 0 | 0 | 84 | 5 | 11 | 7 | ... | 13.79 | 0.964835 | 23.15924 | 23.159240 | 37.05 | NaN | NaN | NaN | NaN | NaN |
| 1 | -110.0472 | 37.60413 | 1980 | sc1 | 0 | 0 | 84 | 5 | 11 | 7 | ... | 2.69 | 0.964835 | 23.15924 | 0.964835 | 37.05 | NaN | NaN | NaN | NaN | NaN |
| 2 | -110.0472 | 37.60413 | 1980 | sc1 | 0 | 0 | 84 | 5 | 11 | 7 | ... | 13.79 | 0.964835 | 23.15924 | 0.964835 | 37.05 | NaN | NaN | NaN | NaN | NaN |
| 3 | -110.0472 | 37.60413 | 1980 | sc1 | 0 | 0 | 84 | 5 | 11 | 7 | ... | 2.69 | 0.964835 | 23.15924 | 23.159240 | 37.05 | NaN | NaN | NaN | NaN | NaN |
| 4 | -110.0472 | 37.60413 | 1980 | sc1 | 0 | 0 | 84 | 5 | 11 | 7 | ... | NaN | NaN | NaN | NaN | NaN | -12.45 | 0.113447 | 0.096831 | 0.041876 | 0.052298 |
5 rows × 27 columns
2.1 Missing Values and Data Types
import seaborn as sns
import matplotlib.pyplot as plt
# Missing-value overview: draw a heatmap of the boolean null mask, one column
# per remaining variable. The columns listed below are left out of the plot.
heatmap_excluded_columns = [
    'lat', 'long', 'year', 'scenario',
    'treecanopy', 'Ann_Herb', 'Bare', 'Litter', 'Shrub',
]
plt.figure(figsize=(15, 10))
# True where a value is missing, False otherwise.
missing_mask = df.drop(columns=heatmap_excluded_columns).isnull()
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap', fontsize=16, fontweight='bold', loc='left')
plt.xticks(rotation=45)
plt.show()
3 Data Preprocessing
# Interactive map of the record locations: latitude comes from the 'lat'
# column and longitude from the 'long' column of df. The figure is built in
# one chained expression; each update_* call returns the same figure object.
fig = (
    px.scatter_geo(
        df,
        lat='lat',
        lon='long',
        scope='usa',
        title='Utah National Parks Geographic Overview',
    )
    .update_geos(projection_type="natural earth")
    .update_layout(showlegend=True)
)
fig.show()
4 EDA
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as pyo
# Initialise Plotly's offline notebook mode before any figures are shown.
pyo.init_notebook_mode(connected=True)

# Feature engineering on the already-loaded DataFrame `df`.
# Annual soil moisture is the per-row mean of the four seasonal VWC columns
# (pandas skips NaN entries by default when averaging).
seasonal_vwc_columns = [
    'VWC_Winter_whole',
    'VWC_Spring_whole',
    'VWC_Summer_whole',
    'VWC_Fall_whole',
]
df['Annual_Soil_Moisture'] = df[seasonal_vwc_columns].mean(axis=1)

# The remaining engineered columns are plain copies of existing columns
# under more descriptive names.
for feature_name, source_column in (
    ('Annual_Temperature', 'T_Annual'),
    ('Annual_Precipitation', 'PPT_Annual'),
):
    df[feature_name] = df[source_column]

# Vegetation-cover columns plotted against the climate features.
vegetation_variables = ['treecanopy', 'Ann_Herb', 'Bare', 'Herb', 'Litter', 'Shrub']
# For every vegetation variable, draw three scatter plots in turn: against
# annual precipitation, annual temperature and annual soil moisture.
# Each spec is (x-axis column in df, name used in the title, x-axis label).
climate_plot_specs = (
    ('Annual_Precipitation', 'Annual Precipitation', 'Annual Precipitation (mm)'),
    ('Annual_Temperature', 'Annual Temperature', 'Annual Temperature (°C)'),
    ('Annual_Soil_Moisture', 'Annual Soil Moisture', 'Annual Soil Moisture'),
)
for veg_var in vegetation_variables:
    for x_column, title_name, x_axis_label in climate_plot_specs:
        # A fresh figure per plot so each chart is shown on its own.
        plt.figure(figsize=(8, 5.5))
        sns.scatterplot(data=df, x=x_column, y=veg_var, color='#E8B989')
        plt.title(f'{title_name} vs {veg_var}')
        plt.xlabel(x_axis_label)
        plt.ylabel(f'{veg_var} Coverage')
        plt.show()